### Configuration
################################################################################
### R Options
nodeInfo = system(paste0("scontrol show node ", Sys.info()["nodename"]), intern = TRUE)
cpus = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "cpu="))[2], ","))[1])
memInKB = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "mem="))[2], "M,"))[1]) * 1024^2
options(stringsAsFactors=FALSE,
dplyr.summarise.inform=FALSE,
future.globals.maxSize=min(memInKB, 20 * 1024^3),
mc.cores=min(cpus,1),
future.fork.enable=TRUE, future.plan="multicore",
future.rng.onMisuse="ignore")
### Fuctions
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_analysis.R",
"functions_biomart.R",
"functions_degs.R",
"functions_io.R",
"functions_plotting.R",
"functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:", paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))
### Debugging mode
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions
param$debugging_mode = "default_debugging"
switch (param$debugging_mode,
default_debugging=on_error_default_debugging(),
terminal_debugger=on_error_start_terminal_debugger(),
print_traceback=on_error_just_print_traceback(),
on_error_default_debugging())### Rmarkdown configuration
################################################################################
### Create output directories
if (!file.exists(file.path(param$path_out, "figures"))) dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
if (!file.exists(file.path(param$path_out, "data"))) dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)
### R Options
options(citation_format="pandoc",
knitr.table.format="html",
kableExtra_view_html=TRUE)
### Knitr default options
knitr::opts_chunk$set(echo=TRUE, # output code
cache=FALSE, # do not cache results
message=TRUE, # show messages
warning=TRUE, # show warnings
results = "hold", # show results after running whole chunk
tidy=FALSE, # do not auto-tidy-up code
fig.width=10, # default fig width in inches
class.source='fold-hide', # by default collapse code blocks
dev=c('png', 'svg', 'tiff'), # create figures in png, tiff, and svg; the first device (png) will be used for HTML output
dev.args=list(png = list(type="cairo"), # png: use cairo - works on cluster
tiff = list(type="cairo", compression = 'zip')), # tiff: use cairo - works on cluster, transparent background, compression type "zip" is ‘deflate (Adobe-style)’
dpi=300, # figure resolution
fig.retina=2, # retina multiplier
fig.path=paste0(file.path(param$path_out, "figures"), "/") # Path for figures in png and pdf format (trailing "/" is needed)
)
### Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)
### Required libraries
library(magrittr)
### Set the bibliographic environment
# Clear previous bibliography
knitcitations::cleanbib()
# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
bib_ref = knitcitations::read.bibtex(file.path(param$path_to_git,"assets","sc_analysis_references.bib"))
invisible(knitcitations::citep(bib_ref))
### Figure heights
# high figures
fig_large_height = 8
# Single figure, landscape
fig_standard_height = 4
# Two plots alongside (e.g. umaps)
fig_standard2_height = 5
# Three plots alongside (e.g. umaps)
fig_standard3_height = 4
# Four plots alongside (e.g. umaps)
fig_standard4_height = 2.5
# Four plots 2x2 (e.g. umaps)
fig_patchwork4_height = fig_standard2_height * 2
# Six plots 2x3 (e.g. umaps)
fig_patchwork6_height = fig_standard3_height * 2
# Eight plots 4x2 (e.g. umaps)
fig_patchwork8_height = fig_standard4_height * 2
# Twelve plots 4x3 (e.g. umaps)
fig_patchwork12_height = fig_standard4_height * 3
# Sixteen plots 4x4 (e.g. umaps)
fig_patchwork16_height = fig_standard4_height * 4# Load renv
renv::load(file.path(param$path_to_git,"env/basic"))
# Required libraries
library(Seurat)
library(patchwork)
library(magrittr)
library(ggplot2)
library(ggsci)
library(pheatmap)Single cell transcriptomes can be difficult to annotate without extensive knowledge of the underlying biology. Hence, the biological knowledge (defined marker genes and cluster identities) can be propagated from an previously annotated dataset to the test dataset in an automated manner and aid in cluster identification.
Here, we project reference data onto a query object to annotate the cells of the query datasets by cell type label transfer and projecting query cells onto reference UMAPs as described in the Seurat vignette ‘Mapping and annotating query datasets’ (https://satijalab.org/seurat/articles/integration_mapping.html). The reference and the query datasets have to be available as S4 class Seurat objects. While the reference object has to contain normalized data and UMAP reduction, correction of the underlying raw query data is not required.
Be aware, that the output is highly impacted by the quality and contend of the reference dataset!
### Read rds object
################################################################################
### Load Seurat S4 objects
# Test if file is defined
if (is.null(param$data)) {
message("Dataset is not specified")
} else {
# Test if file exists
if (file.exists(param$data)) {
# Read object
message(paste0("Load dataset:", param$data))
sc = base::readRDS(param$data)
# Transfer original params to loaded object
if ("parameters" %in% list_names(sc@misc[])) {
# Retrieve previous parameter settings
orig_param = sc@misc$parameters
if ("SCT" %in% names(sc@assays)) {
if ("scale.data" %in% Layers(sc[["SCT"]])) {
orig_param$norm = "SCT"
} else {
orig_param$norm = "RNA"
}
} else {
orig_param$norm = "RNA"
}
# Keep some parameter settings from object and project defined
orig_param_keep = orig_param[c("annot_version", "species")]
basic_param_keep = param[c("path_to_git", "scriptname", "author", "project_id", "data", "path_out", "file_annot", "file_cc_genes")]
# Integrate parameter
param = modifyList(x = param, val = orig_param)
param = modifyList(x = param, val = basic_param_keep)
param = modifyList(x = param, val = param_advset)
param = modifyList(x = param, val = orig_param_keep)
}
### Set colors
# Set sample colors based on orig.ident
if (!is.null(sc@meta.data[["orig.ident"]])) {
if (length(unique(sc@meta.data[["orig.ident"]]))!=length(param$col_samples)) {
message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
sample_colours_out = suppressWarnings(SetSampleColours(sc, param$col_palette_samples))
sc = sample_colours_out[[1]]
param$col_samples = sample_colours_out[[2]]
}
}
# Set colors for clusters based on seurat_clusters
if (!is.null(sc@meta.data[["seurat_clusters"]])) {
if (length(unique(sc@meta.data[["seurat_clusters"]]))!=length(param$col_clusters)) {
message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
cluster_colours_out = suppressWarnings(SetClusterColours(sc, param$col_palette_clusters))
sc = cluster_colours_out[[1]]
param$col_clusters = cluster_colours_out[[2]]
}
}
# Set colors for cell types based on annotation
if (!is.null(sc@meta.data[["annotation"]])) {
if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation))
sc = annotation_colours_out[[1]]
param$col_annotation = annotation_colours_out[[2]]
}
}
sc
} else {
message("Dataset does not exist")
}
}×
(Message)
Load
dataset:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/cluster_analysis/data/sc.rds
### Load reference Seurat S4 objects if specified
# Test if file is defined
if (!is.null(param$refdata)) {
# Test if file exists
if (file.exists(param$refdata)) {
# Read object
message(paste0("Load dataset:", param$refdata))
scR = base::readRDS(param$refdata)
# Transfer original params to loaded object
if ("parameters" %in% list_names(scR@misc[])) {
orig_paramR = scR@misc$parameters
if (!is.null(orig_paramR$col_samples)) {
param$col_samples_ref = orig_paramR$col_samples
}
if (!is.null(orig_paramR$col_clusters)) {
param$col_clusters_ref = orig_paramR$col_clusters
}
if (!is.null(orig_paramR$col_annotation)) {
param$col_annotation_ref = orig_paramR$col_annotation
}
param = modifyList(x = param, val = param_advset)
}
### Set colors
# Set sample colors based on orig.ident
if (!is.null(scR@meta.data[["orig.ident"]])) {
if (length(unique(scR@meta.data[["orig.ident"]]))!=length(param$col_samples_ref)) {
message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
sample_colours_out = suppressWarnings(SetSampleColours(scR, param$col_palette_samples))
scR = sample_colours_out[[1]]
param$col_samples_ref = sample_colours_out[[2]]
}
}
# Set colors for clusters based on seurat_clusters
if (!is.null(scR@meta.data[["seurat_clusters"]])) {
if (length(unique(scR@meta.data[["seurat_clusters"]]))!=length(param$col_clusters_ref)) {
message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
cluster_colours_out = suppressWarnings(SetClusterColours(scR, param$col_palette_clusters))
scR = cluster_colours_out[[1]]
param$col_clusters_ref = cluster_colours_out[[2]]
}
}
# Set colors for cell types based on annotation
if (!is.null(scR@meta.data[["annotation"]])) {
if (length(unique(scR@meta.data[["annotation"]]))!=length(param$col_annotation_ref)) {
message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
annotation_colours_out = suppressWarnings(SetAnnotationColours(scR, param$col_palette_annotation))
scR = annotation_colours_out[[1]]
param$col_annotation_ref = annotation_colours_out[[2]]
}
}
scR
} else {
message("Reference dataset does not exist")
}
} ×
(Message)
Load
dataset:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata_2/cluster_analysis/data/sc.rds
## An object of class Seurat
## 6439 features across 1078 samples within 1 assay
## Active assay: RNA (6439 features, 3000 variable features)
## 3 layers present: data, counts, scale.data
## 2 dimensional reductions calculated: pca, umap
## An object of class Seurat
## 14043 features across 1078 samples within 1 assay
## Active assay: RNA (14043 features, 3000 variable features)
## 3 layers present: counts, data, scale.data
## 2 dimensional reductions calculated: pca, umap
# Expand colors for cell type annotation to to both datasets
param$col_annotation_all = c(param$col_annotation_ref, 'NA' = "#BA6338FF")Cell type classification was performed by projecting a reference dataset onto a query object by transfer of cell type labels from the reference to cells of the query. In the next step, the query data are then project onto the UMAP structure of the reference as described in the vignette https://satijalab.org/seurat/articles/integration_mapping.
The reference-mapping of the query dataset helps us identify cell types. However, the output, means the annotation of cells of the query dataset, is highly impacted by the quality and content of the reference dataset. If there are cell types that are present in the query dataset that are not represented in the reference, they will project to the most similar cells in the reference dataset. Hence, we perform assessment of the prediction score that each cell obtains representing the strength of the prediction. Each prediction is assigned a score between 0 and 1. We test different thresholds.
# From https://satijalab.org/seurat/articles/integration_mapping.html
### Projection of query data onto the UMAP structure of the reference does only work if model is known.
# If reference object does not contain an umap dimensional reduction object
if (is.null(scR@reductions$umap)) {
scR = Seurat::RunUMAP(scR, dims=1:30, verbose=FALSE, umap.method="uwot", n.neighbors=30, return.model = TRUE)
}
# If RunUMAP() was run without return.model = TRUE, set the model manually with information extracted from the seurat object
if (is.null(scR[["umap"]]@misc$model)) {
# set embeddings
scR[["umap"]] <- CreateDimReducObject(embeddings = scR[["umap"]]@cell.embeddings, key = "UMAP_")
# set UMAP models
umap.model <- list()
umap.model$n_epochs <- 200
umap.model$alpha <-1
umap.model$method <- "umap"
umap.model$negative_sample_rate <- scR@commands$RunUMAP.RNA.pca$negative.sample.rate
umap.model$gamma <- 1
umap.model$approx_pow <- FALSE
umap.model$metric$cosine$ndim <- length(scR@commands$RunUMAP.RNA.pca$dims)
umap.model$min_dist = scR@commands$RunUMAP.RNA.pca$min.dist
umap.model$spread = scR@commands$RunUMAP.RNA.pca$spread
umap.model$n_neighbors = scR@commands$RunUMAP.RNA.pca$n.neighbors
umap.model$pca_models = list()
umap.model$spread = scR@commands$RunUMAP.RNA.pca$uwot.sgd
umap.model$embedding <- scR[["umap"]]@cell.embeddings
ab_param <- uwot:::find_ab_params(spread = scR@commands$RunUMAP.RNA.pca$spread, min_dist = scR@commands$RunUMAP.RNA.pca$min.dist)
umap.model$a <- ab_param["a"]
umap.model$b <- ab_param["b"]
scR[["umap"]]@misc$model <- umap.model
}
# Find anchors between reference and query
DefaultAssay(scR) <- "RNA"
DefaultAssay(sc) <- "RNA"
sc.anchors = FindTransferAnchors(reference = scR, query = sc, dims = 1:30, reference.reduction = "pca")
# MapQuery(), a wrapper of TransferData(), IntegrateEmbeddings(), and ProjectUMAP(), transfers cell type labels and impute the ADT values (TransferData()) and projects the query data onto the UMAP structure of the reference (IntegrateEmbeddings() and ProjectUMAP()). Predicted cell types are saved in a predicted.celltype column.
sc = MapQuery(anchorset = sc.anchors, reference = scR, query = sc, refdata = list(celltype = param$celltype), reference.reduction = "pca", reduction.model = "umap")
# Transfer refdata annotation to an additional column of the reference dataset for later visualisation.
scR$annot_cells = scR[[param$celltype]]# Extract numbers predicted celltypes
pred_cells_table = data.frame(table(sc$predicted.celltype))
colnames(pred_cells_table) = c("Cell_type", "0")
# Filter for predicted.celltype.score
pred_score_thresholod = seq(0.1, 1, 0.1)
for (i in pred_score_thresholod) {
sc_subset <- subset(sc, subset = predicted.celltype.score > i)
sc$annot_cells = NULL
sc$annot_cells = sc_subset$predicted.celltype
# Extract numbers predicted celltypes
pred_celltypes_filtered_tbl = data.frame(table(sc$annot_cells))
colnames(pred_celltypes_filtered_tbl) = c("Cell_type", i)
# Combine and print table
pred_cells_table = merge(pred_cells_table, pred_celltypes_filtered_tbl, by.x = "Cell_type", all.x = TRUE)
}
knitr::kable(pred_cells_table, align="l", caption="Number of annotated cells for each prediction score threshold") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) | Cell_type | 0 | 0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | 1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| B-cells | 185 | 185 | 185 | 185 | 185 | 185 | 185 | 185 | 185 | 184 | 60 |
| CD4+ T-cells | 353 | 353 | 353 | 353 | 353 | 353 | 352 | 347 | 339 | 328 | 60 |
| CD8+ T-cells | 128 | 128 | 128 | 128 | 127 | 126 | 125 | 122 | 122 | 115 | 28 |
| Monocytes | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 356 | 355 | 350 | 109 |
| NK cells | 54 | 54 | 54 | 54 | 54 | 54 | 54 | 53 | 52 | 50 | 19 |
tbl = tidyr::pivot_longer(pred_cells_table, cols = 2:12, cols_vary = "fastest", names_to = "Threshold", values_to = "Number")
tbl <- tbl %>% replace(is.na(.), 0)
p = ggplot(tbl) +
geom_point(aes(x = Threshold, y = Number, color = Cell_type)) +
theme_bw() +
scale_color_manual(values=param$col_annotation_ref)
pIn the following the prediction score threshold of 0.9 was used to exclude unlikely cell type annotations.
# Filter by sub-setting for threshold
sc_subset <- subset(sc, subset = predicted.celltype.score > param$predicted_score_threshold)
pred_celltypes = data.frame(table(sc_subset$predicted.celltype))
pred_celltypes = dplyr::mutate(pred_celltypes, Percent = (Freq/sum(pred_celltypes$Freq)*100))
# Filter by subsetting for cells belonging to a specific cell type annotation
filtered_pred_celltypes = dplyr::filter(pred_celltypes, Percent > param$percent_predicted_cells_threshold)
celltype_filter = as.vector(filtered_pred_celltypes$Var1)
Idents(sc_subset) = "predicted.celltype"
sc_subset <- subset(sc_subset, idents = celltype_filter)
# Transfer subsetting results into sc object as annot_cells variable
sc$annot_cells = NULL
sc$annot_cells = sc_subset$predicted.celltype
# Print table
annot_cells = data.frame(table(sc$annot_cells))
colnames(annot_cells) = c("Cell_type", "Number")
unassiged = data.frame("unassigend",(length(colnames(sc))-sum(table(sc$annot_cells))))
colnames(unassiged) = c("Cell_type", "Number")
annot_cells = rbind(annot_cells, unassiged)
annot_cells = dplyr::mutate(annot_cells, Percent = (Number/sum(annot_cells$Number)*100))
knitr::kable(annot_cells, align="l", caption="Number and percentage of annotated cells") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) | Cell_type | Number | Percent |
|---|---|---|
| B-cells | 184 | 17.068646 |
| CD4+ T-cells | 328 | 30.426716 |
| CD8+ T-cells | 115 | 10.667903 |
| Monocytes | 350 | 32.467532 |
| NK cells | 50 | 4.638219 |
| unassigend | 51 | 4.730983 |
fig_annot_celltypes = ceiling(length(pred_celltypes)/3) * 4Visualization of the prediction score of annotated cell types in the query dataset.
# plot cell type prediction scores
DefaultAssay(sc) <- 'prediction.score.celltype'
p <- FeaturePlot(sc, reduction = "ref.umap", features = celltype_filter, ncol = 3,
cols = c(param$col_bg, param$col))
p# plot cell type prediction scores
p <- FeaturePlot(sc, reduction = param$reduction, features = celltype_filter, ncol = 3,
cols = c(param$col_bg, param$col))
pDefaultAssay(sc) <- "RNA"Visualization query cells projected into the UMAP visualization defined by the reference alongside the reference.
p1 = Seurat::DimPlot(scR, reduction = "umap", group.by = "annot_cells", pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE, shuffle = TRUE) +
AddStyle(title="Reference dataset with annotations", xlab = "UMAP 1", ylab = "UMAP 2") +
theme(title = element_text(size = 10)) +
NoGrid()
p2 = Seurat::DimPlot(sc, reduction = "ref.umap", group.by = "annot_cells", pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE, shuffle = TRUE) +
AddStyle(title="Cells of query dataset projected on reference UMAP", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
theme(title = element_text(size = 10)) +
NoGrid()
p = p1 + p2
pVisualization annotated query cells in original UMAP visualization alongside the original query annotation.
p1 = Seurat::DimPlot(sc, reduction = param$reduction, group.by = param$celltype, pt.size=param$pt_size, label = TRUE, cols = param$col_annotation, label.size = 3, repel = TRUE, shuffle = TRUE) +
AddStyle(title="Query dataset with original annotation", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
theme(title = element_text(size = 10)) +
NoGrid()
p2 = Seurat::DimPlot(sc, reduction = param$reduction, group.by = "annot_cells", pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE, shuffle = TRUE) +
AddStyle(title="Query dataset with transferred labels", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
theme(title = element_text(size = 10)) +
NoGrid()
p = p1 + p2
p
Above, we have projected the query cells onto to the reference-derived UMAP. Keeping a consistent visualization can assist with the interpretation of new datasets. However, the limitation of this approach lies also exactly within the predefined cell types and reference-derived UMAP and can mask the presence of new cell types in the query which may be of interest. Hence, if the query dataset contains cell types that are not present in the reference, computing a ‘de novo’ visualization is an important step in interpreting the dataset.
This part follows the example of the vignette https://satijalab.org/seurat/articles/multimodal_reference_mapping.
# Slim down a Seurat object to reduces the amount of data in the subsequent merge.
reference = DietSeurat(scR, counts = FALSE, dimreducs = "pca")
query = DietSeurat(sc, counts = FALSE, dimreducs = "ref.pca")
# Make cell names unique
colnames(reference) = paste("reference", colnames(reference), sep = "_")
colnames(query) = paste("query", colnames(query), sep = "_")
# Select equal amount of pca of both datasets
red_pca = reference[["pca"]][[,1:30]]
reference[["red_pca"]] = CreateDimReducObject(embeddings = red_pca, key = "PC_", assay = DefaultAssay(reference))
red_pca = query[["ref.pca"]][[,1:30]]
query[["red_pca"]] = CreateDimReducObject(embeddings = red_pca, key = "PC_", assay = DefaultAssay(query))
# Merge both datasets
reference$id <- 'reference'
query$id <- 'query'
refquery = base::merge(reference, query)
refquery[["pca"]] = base::merge(reference[["red_pca"]], query[["red_pca"]])
refquery = Seurat::RunUMAP(refquery, reduction = 'pca', dims=1:param$pc_n, verbose=FALSE, umap.method="uwot", n.neighbors=param$umap_k)
# Set sample colors
param$col_samples = c('reference' = "#5DB1DDFF", 'query' = "#802268FF")p1 = DimPlot(refquery, group.by = "id", shuffle = TRUE, pt.size=param$pt_size, cols = param$col_samples, label = TRUE, label.size = 3, repel = TRUE) +
AddStyle(title="Groups", xlab = "UMAP 1", ylab = "UMAP 2") +
theme(title = element_text(size = 10)) +
NoLegend() +
NoGrid()
p2 = DimPlot(refquery, group.by = "annot_cells", shuffle = TRUE, pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE) +
AddStyle(title="Annotation with reference dataset", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
theme(title = element_text(size = 10)) +
NoGrid()
p = p1 + p2
pfacet_names <- c(`TRUE` = "Query", `FALSE` = "Reference")
p = DimPlot(refquery, group.by = "annot_cells", shuffle = TRUE, split.by = "id", pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE) +
AddStyle(title="Annotation with reference dataset", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
theme_classic() +
theme(title = element_text(size = 10)) +
NoGrid() +
facet_wrap(~id %in% c("query"), drop = TRUE, labeller = as_labeller(facet_names))
pHighlight common cell types B-cells, CD4+ T-cells, CD8+ T-cells, Monocytes, NK cells.
Idents(reference) = "annot_cells"
sc_subset_ref <- subset(reference, idents = celltype_filter)
Idents(query) = "annot_cells"
sc_subset_query <- subset(query, idents = celltype_filter)
p1 = DimPlot(refquery, group.by = param$celltype, shuffle = TRUE, pt.size=param$pt_size, label = TRUE, label.size = 3, repel = TRUE, cells = colnames(reference), cells.highlight = colnames(sc_subset_ref), sizes.highlight = param$pt_size) +
AddStyle(title="Reference", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
theme(title = element_text(size = 10)) +
scale_color_manual(labels = c("Unique cell types","Common cell types"), values = c(param$col_bg, param$col)) +
NoLegend() +
NoGrid()
p2 = DimPlot(refquery, group.by = param$celltype, shuffle = TRUE, pt.size=param$pt_size, label = TRUE, label.size = 3, repel = TRUE, cells = colnames(query), cells.highlight = colnames(sc_subset_query), sizes.highlight = param$pt_size) +
AddStyle(title="Query", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
theme(title = element_text(size = 10)) +
scale_color_manual(labels = c("Unique cell types","Common cell types"), values = c(param$col_bg, param$col)) +
NoGrid()
p = p1 + p2
pThis section of the report visualizes the above calculated cell cycle scores. Scoring is based on the strategy described in Tirosh et al. (2016)
p1 = DimPlot(refquery, group.by = param$celltype, shuffle = TRUE, pt.size=param$pt_size, label = TRUE, cols = param$col_annotation, label.size = 3, repel = TRUE, cells = colnames(query)) +
AddStyle(title="Original query annotation", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "bottom") +
theme(title = element_text(size = 10)) +
NoGrid()
p2 = Seurat::DimPlot(refquery, group.by="Phase", pt.size=param$pt_size) +
AddStyle(title="Cell cycle phases", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
theme(title = element_text(size = 10)) +
NoGrid()
p3 = suppressMessages(Seurat::FeaturePlot(refquery, features="nCount_RNA")) +
AddStyle(title="nCount_RNA", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
scale_colour_gradient(low="lightgrey", high=param$col, trans="log10") +
theme(title = element_text(size = 10)) +
NoGrid()
p4 = suppressMessages(Seurat::FeaturePlot(refquery, features="nFeature_RNA")) +
AddStyle(title="nFeature_RNA", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
scale_colour_gradient(low="lightgrey", high=param$col, trans="log10") +
theme(title = element_text(size = 10)) +
NoGrid()
p5 = suppressMessages(Seurat::FeaturePlot(refquery, features="percent_mt", cols=c("lightgrey", param$col))) +
AddStyle(title="percent_mt", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
theme(title = element_text(size = 10)) +
NoGrid()
p = p1 + p2 + plot_spacer() + p3 + p4 + p5 +
plot_layout(ncol = 3)
pannot_cells_ref = data.frame(table(scR$annot_cells))
colnames(annot_cells_ref) = c("Cell_type", "Number")
annot_cells_ref = dplyr::mutate(annot_cells_ref, Group = "reference")
annot_cells_ref = dplyr::mutate(annot_cells_ref, Percent = (Number/sum(annot_cells_ref$Number)*100))
annot_cells = dplyr::mutate(annot_cells, Group = "query")
cells_tbl = rbind(annot_cells_ref, annot_cells)
p = ggplot(cells_tbl, aes(x = Cell_type, y = Percent, fill = Group)) +
geom_bar(stat = "identity") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1.05)) +
scale_fill_manual(values=param$col_samples)
p p1 = ggplot(cells_tbl, aes(x = Group, y = Percent, fill = Cell_type )) +
geom_bar(stat = "identity") +
theme_classic(base_size = 10) +
scale_fill_manual(values=param$col_annotation_all)
p2 = ggplot(annot_cells_ref, aes(x = 2, y = Percent, fill = Cell_type )) +
geom_bar(stat="identity", width=1, color = "white") +
coord_polar("y", start = 0) +
geom_text(aes(label = round(Percent,1)), position = position_stack(vjust = 0.5), size = 3, color = "white") +
theme_void() +
xlim(0.8, 2.5) +
scale_fill_manual(values=param$col_annotation_all)
p3 = ggplot(annot_cells, aes(x = 2, y = Percent, fill = Cell_type )) +
geom_bar(stat="identity", width=1, color = "white") +
coord_polar("y", start = 0) +
geom_text(aes(label = round(Percent,1)), position = position_stack(vjust = 0.5), size = 3, color = "white") +
theme_void() +
xlim(0.8, 2.5) +
scale_fill_manual(values=param$col_annotation_all)
p = p1 + p2 + plot_spacer() + p3
p
# Add experiment details
Seurat::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))
# Add parameter
Seurat::Misc(sc, "parameters") = param
# Add technical output (note: cannot use Misc function here)
sc@misc$technical = data.frame(ScrnaseqSessionInfo(param$path_to_git))### Export seurat object
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))### Convert data
################################################################################
### Export to Loupe Cell Browser
if (all(param$path_data$type == "10x")) {
# Export reductions (umap, pca, others)
for(r in Seurat::Reductions(sc)) {
write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
file=file.path(param$path_out, "data", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}
# Export categorical metadata
loupe_meta = sc@meta.data
idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
write.table(x=loupe_meta[, idx_keep] %>% tibble::rownames_to_column(var="barcode"),
file=file.path(param$path_out, "data", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}
### Export as andata object (h5ad file)
# Convert Seurat v5 single cell object to anndata object
# Does not work for SCT at the moment
# However, it would contain counts and data layer
#scCustomize::as.anndata(x = sc, file_path = file.path(param$path_out, "data"), file_name = "sc_anndata.h5ad", assay="RNA",
# main_layer = "data", other_layers = "counts", transer_dimreduc = TRUE, verbose = FALSE)
# scesy only worked for v3 and v4 objects
# Convert to V3/4/Assay structure
sc_v3 = scCustomize::Convert_Assay(seurat_object = sc, convert_to = "V3", assay = "RNA")
# Convert Seurat v3 single cell object to anndata object
adata = sceasy::convertFormat(sc_v3, from="seurat", to="anndata", outFile=NULL, assay=Seurat::DefaultAssay(sc_v3))
# Write to h5ad file
adata$write(file.path(param$path_out, "data", "sc_anndata.h5ad"), compression="gzip")
### Export count matrix
invisible(ExportSeuratAssayData(sc,
dir=file.path(param$path_out, "data", "counts"),
assays=param$assay_raw,
slot="counts",
include_cell_metadata_cols=colnames(sc[[]]),
metadata_prefix=paste0(param$project_id, ".")))
### Export metadata
openxlsx::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), rowNames=FALSE, colNames=TRUE)We export the assay data, cell metadata, clustering and visualization.
Result files:
* sc.rds: Seurat object for import into R
* cell_metadata.xlsx: Cell metadata in excel format
* counts folder: Contains raw count matrix of the entire dataset (sparse
matrix)
We export the assay data, cell metadata, clustering and visualization in andata format.
Result file:
* sc.h5ad: H5AD object
If all provided datasets are of type “10x”, we export the UMAP 2D visualization, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser (https://support.10xgenomics.com/single-cell-gene-expression/software/visualization/latest/what-is-loupe-cell-browser).
Result files are:
* Loupe_projection_(umap|pca|…).csv: Seurat UMAP/PCA/… projections for
visualization
* Loupe_metadata.csv: Seurat cell meta data including clusters and cell
cycle phases
Projections can be imported in Loupe via “Import Projection”, cell meta data via “Import Categories” and gene sets via “Import Lists”.
# Output reference seurat object
saveRDS(scR, file=file.path(param$path_out, "data", "scR.rds"))
The following parameters were used to run the workflow.
out = ScrnaseqParamsInfo(params=param)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")| Name | Value |
|---|---|
| path_to_git | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis |
| path_to_basic_settings | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/basic_settings.R |
| path_to_advanced_settings | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/advanced_settings.R |
| scriptname | scripts/dataset_mapping/dataset_mapping_seurat.Rmd |
| author | kosankem |
| assay_raw | RNA |
| downsample_cells_equally | FALSE |
| cell_filter | nFeature_RNA:20, NA; nCount_RNA:200, 20000; percent_mt:0, 18; Sample1:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18); Sample2:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18) |
| feature_filter | min_counts:1; min_cells:5; Sample1:min_counts=1, min_cells=5; Sample2:min_counts=1, min_cells=5 |
| samples_min_cells | 10 |
| norm | RNA |
| cc_remove | FALSE |
| cc_remove_all | FALSE |
| cc_rescore_after_merge | TRUE |
| integrate_samples | method:merge; dimensions:30; k_anchor:10; k_weight:100; reference:; integration_function:CCAIntegration |
| experimental_groups | homogene |
| pc_n | 8 |
| cluster_k | 20 |
| umap_k | 30 |
| cluster_resolution_test | 0.5, 0.7, 0.8 |
| cluster_resolution | 0.6 |
| marker_padj | 0.05 |
| marker_log2FC | 1 |
| marker_pct | 0.25 |
| enrichr_site | Enrichr |
| enrichr_padj | 0.05 |
| col_palette_samples | ggsci::pal_igv |
| col_palette_clusters | ggsci::pal_igv |
| col_palette_annotation | ggsci::pal_ucscgb |
| col | #0086b3 |
| col_bg | #D3D3D3 |
| pt_size | 0.5 |
| liana_methods | logfc, natmi, cellphonedb |
| liana_agg_rank_threshold | 0.01 |
| celltype | annotation |
| reduction | umap |
| predicted_score_threshold | 0.9 |
| percent_predicted_cells_threshold | 0.1 |
| project_id | Testdata |
| data | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/cluster_analysis/data/sc.rds |
| download_test_datasets | download_10x_pbmc_1k_healthyDonor_v3Chemistry |
| species | human |
| refdata | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata_2/cluster_analysis/data/sc.rds |
| path_out | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/dataset_mapping_seurat |
| debugging_mode | default_debugging |
| mart_dataset | hsapiens_gene_ensembl |
| mt | ^MT- |
| enrichr_dbs | GO_Biological_Process_2023, WikiPathway_2023_Human, Azimuth_2023, CellMarker_2024 |
| annotation_dbs | BlueprintEncodeData() |
| annot_version | 98 |
| annot_main | ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession |
| mart_attributes | ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description |
| path_reference | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98 |
| reference | hsapiens_gene_ensembl.v98.annot.txt |
| file_annot | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/hsapiens_gene_ensembl.v98.annot.txt |
| file_cc_genes | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/cell_cycle_markers.xlsx |
| path_data | name:Sample1, Sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample1/, /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample2/ |
| col_samples | reference=#5DB1DDFF, query=#802268FF |
| col_clusters | 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF, 5=#466983FF, 6=#BA6338FF, 7=#5DB1DDFF |
| col_samples_ref | sample1=#5050FFFF |
| col_clusters_ref | 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF, 5=#466983FF, 6=#BA6338FF, 7=#5DB1DDFF, 8=#802268FF |
| col_annotation_ref | Monocytes=#FF0000FF, B-cells=#FF9900FF, CD8+ T-cells=#FFCC00FF, CD4+ T-cells=#00FF00FF, NK cells=#6699FFFF |
| col_annotation_all | Monocytes=#FF0000FF, B-cells=#FF9900FF, CD8+ T-cells=#FFCC00FF, CD4+ T-cells=#00FF00FF, NK cells=#6699FFFF, NA=#BA6338FF |
This report was generated using the scripts/dataset_mapping/dataset_mapping_seurat.Rmd script. Software versions were collected at run time.
out = ScrnaseqSessionInfo(param$path_to_git)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))| Name | Value |
|---|---|
| Run on: | Wed Sep 04 02:28:53 PM 2024 |
| ktrns/scrnaseq | 35dd8e070b3e3baff5405ec6fea400c39e724d1c |
| Container | NA |
| R | R version 4.2.1 (2022-06-23) |
| Platform | x86_64-pc-linux-gnu (64-bit) |
| Operating system | Ubuntu 20.04.6 LTS |
| Host name | hpc-rc11 |
| Host OS | #138-Ubuntu SMP Wed Jun 22 15:00:31 UTC 2022 (5.4.0-122-generic) |
| Packages | abind1.4-5, AnnotationDbi1.60.2, AnnotationHub3.6.0, ape5.8, assertthat0.2.1, backports1.4.1, basilisk1.17.2, basilisk.utils1.17.2, beachmat2.14.2, beeswarm0.4.0, bibtex0.5.1, Biobase2.58.0, BiocFileCache2.6.1, BiocGenerics0.44.0, BiocManager1.30.22, BiocNeighbors1.16.0, BiocParallel1.32.6, BiocSingular1.14.0, BiocVersion3.16.0, Biostrings2.66.0, bit4.0.5, bit644.0.5, bitops1.0-7, blob1.2.4, bluster1.8.0, bslib0.6.1, cachem1.0.8, Cairo1.6-2, celldex1.8.0, cellranger1.1.0, checkmate2.3.1, circlize0.4.16, cli3.6.2, clue0.3-65, cluster2.1.3, clustree0.5.1, codetools0.2-18, colorspace2.1-0, ComplexHeatmap2.14.0, cowplot1.1.3, crayon1.5.2, curl5.2.1, data.table1.15.2, DBI1.2.2, dbplyr2.2.1, DelayedArray0.24.0, DelayedMatrixStats1.20.0, deldir2.0-4, digest0.6.34, dir.expiry1.6.0, doParallel1.0.17, dotCall641.1-1, dplyr1.1.4, dqrng0.3.2, edgeR3.40.2, ellipsis0.3.2, enrichR3.2, evaluate0.23, ExperimentHub2.6.0, fansi1.0.6, farver2.1.1, fastDummies1.7.3, fastmap1.1.1, filelock1.0.3, fitdistrplus1.1-11, forcats1.0.0, foreach1.5.2, future1.33.1, future.apply1.11.1, generics0.1.3, GenomeInfoDb1.34.9, GenomeInfoDbData1.2.9, GenomicRanges1.50.2, GetoptLong1.0.5, ggbeeswarm0.7.2, ggforce0.4.2, ggplot23.5.0, ggprism1.0.5, ggraph2.2.1, ggrastr1.0.2, ggrepel0.9.5, ggridges0.5.6, ggsci3.0.1, GlobalOptions0.1.2, globals0.16.2, glue1.7.0, goftest1.2-3, graphlayouts1.1.1, gridExtra2.3, gridGraphics0.5-1, gtable0.3.4, highr0.10, hms1.1.3, htmltools0.5.7, htmlwidgets1.6.4, httpuv1.6.14, httr1.4.7, ica1.0-3, igraph2.0.2, interactiveDisplayBase1.36.0, IRanges2.32.0, irlba2.3.5.1, iterators1.0.14, janitor2.2.0, jquerylib0.1.4, jsonlite1.8.8, kableExtra1.4.0, KEGGREST1.38.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.45, labeling0.4.3, later1.3.2, lattice0.20-45, lazyeval0.2.2, leiden0.4.3.1, liana0.1.14, lifecycle1.0.4, limma3.54.2, listenv0.9.1, lmtest0.9-40, locfit1.5-9.9, logger0.3.0, lubridate1.9.3, magrittr2.0.3, MASS7.3-58, MAST1.27.1, Matrix1.6-5, MatrixGenerics1.10.0, matrixStats1.1.0, memoise2.0.1, metapod1.6.0, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-157, OmnipathR3.13.22, openxlsx4.2.5.2, paletteer1.6.0, parallelly1.37.1, patchwork1.2.0, pbapply1.7-2, pheatmap1.0.12, pillar1.9.0, pkgconfig2.0.3, plotly4.10.4, plyr1.8.9, png0.1-8, polyclip1.10-6, prettyunits1.2.0, progress1.2.3, progressr0.14.0, promises1.2.1, purrr1.0.2, R.methodsS31.8.2, R.oo1.26.0, R.utils2.12.3, R62.5.1, ragg1.2.7, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-3, Rcpp1.0.12, RcppAnnoy0.0.22, RcppHNSW0.6.0, RCurl1.98-1.14, readr2.1.5, readxl1.4.3, RefManageR1.4.0, rematch22.1.2, remotes2.4.2.1, renv0.16.0, reshape21.4.4, reticulate1.35.0, rjson0.2.21, rlang1.1.3, rmarkdown2.25, ROCR1.0-11, RSpectra0.16-1, RSQLite2.3.5, rstudioapi0.15.0, rsvd1.0.5, Rtsne0.17, rvest1.0.4, S4Vectors0.36.2, sass0.4.8, ScaledMatrix1.6.0, scales1.3.0, scater1.26.1, scattermore1.2, scCustomize2.1.2, sceasy0.0.7, scran1.26.2, sctransform0.4.1, scuttle1.8.4, sessioninfo1.2.2, Seurat5.0.2, SeuratObject5.0.1, shape1.4.6.1, shiny1.8.0, SingleCellExperiment1.20.1, SingleR2.0.0, snakecase0.11.1, sp2.1-3, spam2.10-0, sparseMatrixStats1.10.0, spatstat.data3.0-4, spatstat.explore3.2-6, spatstat.geom3.2-9, spatstat.random3.2-3, spatstat.sparse3.0-3, spatstat.utils3.0-4, statmod1.5.0, stringi1.8.3, stringr1.5.1, SummarizedExperiment1.28.0, survival3.2-13, svglite2.1.3, systemfonts1.0.5, tensor1.5, textshaping0.3.7, tibble3.2.1, tidygraph1.3.1, tidyr1.3.1, tidyselect1.2.0, timechange0.3.0, tweenr2.0.3, tzdb0.4.0, utf81.2.4, uwot0.1.16, vctrs0.6.5, vipor0.4.7, viridis0.6.5, viridisLite0.4.2, withr3.0.0, WriteXLS6.7.0, xfun0.41, XML3.99-0.16.1, xml21.3.6, xtable1.8-4, XVector0.38.0, yaml2.3.8, zip2.3.1, zlibbioc1.44.0, zoo1.8-12 |
This Workflow was developed as module of the sc_analysis workflow at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). Seurat Vignettes were initially used as templates (Hao et al. (2023), Hao et al. (2021)).
# Writes knitcitations references to references.bib file.
knitcitations::write.bibtex(file=file.path(param$path_out, "references.bib"))